A STUDY OF BANK CUSTOMER CHURN

¶

Description of each variable:

  • Credit_score — credit rating
  • geography — client country
  • gender — client gender
  • age — client age
  • tenure — number of years spent by the client with the bank
  • balance — client account balance
  • num_of_product — number of products that the client purchased from the bank
  • has_cr_card — the client has a credit card
  • is-_active_member — active client
  • estimated_salary — clien salary
  • exited — client left the bank
In [ ]:
# load libraries
import pandas as pd 
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt   #plt.xkcd()
from IPython.display import set_matplotlib_formats
import matplotlib.ticker as mtick
from sklearn.preprocessing import LabelEncoder
from utilities import *
import scipy.stats as ss
from scipy.stats import ttest_ind
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score as f1, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold, GridSearchCV, RepeatedStratifiedKFold, RandomizedSearchCV
from itertools import combinations
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,  GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings("ignore")
In [ ]:
# Improve quality of the plots
set_matplotlib_formats('svg')
# Set the style of the plot
plt.style.use('seaborn-v0_8-white')
#plt.xkcd()

1-Load and clean the data¶

In [ ]:
# Load the data
df = pd.read_csv("churn.csv", index_col=False)
df.head(5)
Out[ ]:
RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited
0 1 15634602 Hargrave 619 France Female 42 2 0.00 1 1 1 101348.88 1
1 2 15647311 Hill 608 Spain Female 41 1 83807.86 1 0 1 112542.58 0
2 3 15619304 Onio 502 France Female 42 8 159660.80 3 1 0 113931.57 1
3 4 15701354 Boni 699 France Female 39 1 0.00 2 0 0 93826.63 0
4 5 15737888 Mitchell 850 Spain Female 43 2 125510.82 1 1 1 79084.10 0
  • At least 3 columns do not provide training value : RowNumber, CustomerId and Surname
In [ ]:
col_name = ['credit_score', 'geography', 'gender', 'age', 'tenure', 'balance', 
            'num_of_products', 'has_cr_card', 'is_active_member', 'estimated_salary', 'exited']
In [ ]:
# Remove unnecessary variables
df = df.drop(["RowNumber", "CustomerId", "Surname"], axis=1)
# Change the columns name
df.columns = col_name
# Show the data
df.head(5)
Out[ ]:
credit_score geography gender age tenure balance num_of_products has_cr_card is_active_member estimated_salary exited
0 619 France Female 42 2 0.00 1 1 1 101348.88 1
1 608 Spain Female 41 1 83807.86 1 0 1 112542.58 0
2 502 France Female 42 8 159660.80 3 1 0 113931.57 1
3 699 France Female 39 1 0.00 2 0 0 93826.63 0
4 850 Spain Female 43 2 125510.82 1 1 1 79084.10 0
In [ ]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   credit_score      10000 non-null  int64  
 1   geography         10000 non-null  object 
 2   gender            10000 non-null  object 
 3   age               10000 non-null  int64  
 4   tenure            10000 non-null  int64  
 5   balance           10000 non-null  float64
 6   num_of_products   10000 non-null  int64  
 7   has_cr_card       10000 non-null  int64  
 8   is_active_member  10000 non-null  int64  
 9   estimated_salary  10000 non-null  float64
 10  exited            10000 non-null  int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 859.5+ KB
  • The set contains information about 10,127 bank users

  • No empty values found

  • There are both nominative and quantitative characteristics

In [ ]:
# Number of unique value for each variable
df.nunique()
Out[ ]:
credit_score         460
geography              3
gender                 2
age                   70
tenure                11
balance             6382
num_of_products        4
has_cr_card            2
is_active_member       2
estimated_salary    9999
exited                 2
dtype: int64

2-Statistics and data visualisation¶

Bar plot of binary variables¶

In [ ]:
df.head(5)
Out[ ]:
credit_score geography gender age tenure balance num_of_products has_cr_card is_active_member estimated_salary exited
0 619 France Female 42 2 0.00 1 1 1 101348.88 1
1 608 Spain Female 41 1 83807.86 1 0 1 112542.58 0
2 502 France Female 42 8 159660.80 3 1 0 113931.57 1
3 699 France Female 39 1 0.00 2 0 0 93826.63 0
4 850 Spain Female 43 2 125510.82 1 1 1 79084.10 0
In [ ]:
plt.figure(figsize=(8, 8))
bar_columns = ["exited", "gender", "has_cr_card", "is_active_member"]
counter = 1
for key in bar_columns:
    plt.subplot(2, 2, counter)
    bar_plot(df[key], x_label=key, y_label="%", color=['indigo', 'gold'])
    counter += 1
No description has been provided for this image
  • Gender and activities among customers are evenly distributed.
  • The proportion of customers holding a credit card is lower.
  • The proportion of exiting customers is low.

Box plot of continuous variables¶

In [ ]:
numeric_col = ['age', 'estimated_salary', 'balance', 'credit_score']
In [ ]:
plt.figure(figsize=(10, 10))
counter = 1
for key in numeric_col:
    plt.subplot(2, 2, counter)
    sns.boxplot(x = df['exited'], y = df[key], palette=['indigo', 'gold'])\
                                                    .set(title=f'{key} by exited')
    counter += 1
No description has been provided for this image

Correlation between numerical variables¶

In [ ]:
# Correlation between numrical variable
plt.figure(figsize=(9, 6))
sns.heatmap(df.drop(['geography', 'gender'], axis=1).corr(), annot=True, cmap='viridis', linewidths=.5)
plt.title('Correlation Plot')
Out[ ]:
Text(0.5, 1.0, 'Correlation Plot')
No description has been provided for this image
  • Based on the chart provided, there is no strong linear relationship observed between the variables.

Association between categorical variables¶

For this purpose, we will use Cramer's V. It measures how strongly two categorical fields are associated.

$$ \mathcal{V} = \sqrt{\frac{\chi^2/n}{min(r-1, c-1)}} $$

In [ ]:
categorical_col = ["num_of_products", "gender", "has_cr_card", "is_active_member", "geography"]
score = {}
exited = df["exited"]
for col in categorical_col:
    temp = df[col]
    the_confusion_matrix = pd.crosstab(exited, temp).to_numpy()
    score[col] = CramerV(the_confusion_matrix, True)

score = pd.Series(score, index=categorical_col).sort_values(ascending=False)
In [ ]:
score.plot(kind="bar", grid=True, color=['#4B0082', '#4B008266', '#4B008299', '#4B0082CC'])
plt.xticks(rotation=50, ha='right')
plt.title('Exited and Categorical')
Out[ ]:
Text(0.5, 1.0, 'Exited and Categorical')
No description has been provided for this image
  • A Cramer's V value of 0.4 indicates a moderate level of association between exited and num_of_products. The association is more substantial compared to the previous value, but it's still not very strong.

  • The remaine variables suggests a relatively weak association with the exited variable.

Conditional probabilities¶

In [ ]:
#--------------------------------------------
# Calculating the continitional probabilities 
#--------------------------------------------
probs = {}
for col in categorical_col:
    uniques = df[col].unique()
    for unique in uniques:
        condition = df[col] == unique
        probs[f'P(Exited|{col}={unique})'] = round(df.loc[condition, 'exited'].mean(), 2)


probs = pd.Series(probs).sort_values(ascending=False)
In [ ]:
#--------------------------------------------
# plot the  probabilities 
#--------------------------------------------
colors = ["#FF0000", "#FF4500", "#FFA500", "#FFD700", "#FFFF00","#ADFF2F", "#7FFF00", "#00FF00", 
          "#00FA9A", "#00CED1", "#4682B4", "#0000FF", "#4B0082"]
probs.plot(kind="bar", grid = True, color = colors)
plt.xticks(rotation=50, ha='right')
plt.title('Probability of Exited given Categorical')
Out[ ]:
Text(0.5, 1.0, 'Probability of Exited given Categorical')
No description has been provided for this image
  • Customers with 4 or 3 products face a higher risk of exiting the bank compared to those with 1 or 2 products.

  • For the other variables, the risk of exiting is significantly low.

Distribution of continuous variables¶

In [ ]:
fig, axes = plt.subplots(2,2, figsize = (9,9))
plt.subplots_adjust(hspace=0.5)
columns = df[numeric_col]
for i, column in enumerate(columns):
    ax = axes[i // 2, i % 2]
    sns.kdeplot(data = df,
                x = column,
                fill = True,
                alpha = 0.5,
                hue = 'exited',
                palette = ['#6A5ACD', '#4B0082'],
                ax = ax)
    ax.set_xlabel(column, fontsize = 14)
plt.show()
No description has been provided for this image
  • Given the distributions above, we observe that the distribution changes when clients exit or stay.
  • We observe a heavy tail in the distributions, which implies higher probabilities for the extreme values.
  • Clients over 70 and below 20 have a higher chance of not exiting..
In [ ]:
plt.figure(figsize=(10, 9))
paired_list_numerical = list(combinations(numeric_col, 2))
counter = 1
for key in paired_list_numerical:
    plt.subplot(3, 2, counter)
    sns.scatterplot(x=key[0], y=key[1], hue="exited", palette=['#6A5ACD', "gold"], data=df) 
    counter += 1
No description has been provided for this image
  • According to the depicted plot, as customer age increases, there appears to be a tendency for the balance to converge towards the median value. Otherwie, there is no significant relationship discernible among the other numerical variables.
In [ ]:
fig, axes = plt.subplots(2,2, figsize = (9,9))
plt.subplots_adjust(hspace=0.5)
columns = df[numeric_col]
for i, column in enumerate(columns):
    ax = axes[i // 2, i % 2]
    sns.kdeplot(data = df,
                x = column,
                fill = True,
                alpha = 0.5,
                hue = 'gender',
                palette = ['#4B0082', '#6A5ACD'],
                ax = ax)
    ax.set_xlabel(column, fontsize = 14)
plt.show()
No description has been provided for this image

We observe slightly higher values in credit score, estimated salary, age, and account balance for males compared to females. This difference is linked to the greater number of male customers as opposed to female customers.

In [ ]:
fig, axes = plt.subplots(2,2, figsize = (10,10))
plt.subplots_adjust(hspace=0.5)
columns = df[numeric_col]
for i, column in enumerate(columns):
    ax = axes[i // 2, i % 2]
    sns.kdeplot(data = df,
                x = column,
                fill = True,
                alpha = 0.5,
                hue = 'geography',
                palette = ['#17b3a8', '#ff0a89', '#4682B4'],
                ax = ax)
    ax.set_xlabel(column, fontsize = 14)
plt.show()
No description has been provided for this image
  • For age, credit score, balance, and estimated salary, France exhibits higher values in the tail, indicating a higher probability of extreme values.

Summary of continuous variables¶

In [ ]:
# Summary of numerical variables
df[numeric_col].describe()
Out[ ]:
age estimated_salary balance credit_score
count 10000.000000 10000.000000 10000.000000 10000.000000
mean 38.921800 100090.239881 76485.889288 650.528800
std 10.487806 57510.492818 62397.405202 96.653299
min 18.000000 11.580000 0.000000 350.000000
25% 32.000000 51002.110000 0.000000 584.000000
50% 37.000000 100193.915000 97198.540000 652.000000
75% 44.000000 149388.247500 127644.240000 718.000000
max 92.000000 199992.480000 250898.090000 850.000000

Mean of continuous variables given the exited variable¶

In [ ]:
ex_cr = df.groupby('exited')['credit_score'].mean().reset_index()
ex_age = df.groupby('exited')['age'].mean().reset_index()
merge_1 = pd.merge(ex_cr, ex_age, on='exited')
ex_salary = df.groupby('exited')['estimated_salary'].mean().reset_index()
merge_2 = pd.merge(merge_1, ex_salary, on='exited')
ex_balance = df.groupby('exited')['estimated_salary'].mean().reset_index()
pd.merge(merge_2, ex_balance, on='exited')
Out[ ]:
exited credit_score age estimated_salary_x estimated_salary_y
0 0 651.853196 37.408389 99738.391772 99738.391772
1 1 645.351497 44.837997 101465.677531 101465.677531

Visually, the conditional mean of continuous variables shows minimal distinctions.

To validate this observation, we will conduct statistical tests.

We will test if

$$ H_0 : \mathbb{E}[X_i|exited = 1] - \mathbb{E}[X_i|exited = 0] = 0 $$

$$ H_1 : \mathbb{E}[X_i|exited = 1] - \mathbb{E}[X_i|exited = 0] \neq 0 $$

$H_0$ : No significant difference between the means

$H_1$ : There is a significant difference between the means

The sample mean : $\bar{Y}_{1} = \frac{1}{n_1}\sum_{i=1}^{n_1}Y_{1, i}$ and $\bar{Y}_{2} = \frac{1}{n_2}\sum_{i=1}^{n_2}$

The sample variance : $S^2_1 = \frac{1}{n_1-1}\sum_{i=1}^{n_1}(Y_{1,i} - \bar{Y}_{1})^2$ and $S^2_2 = \frac{1}{n_2-1}\sum_{i=1}^{n_2}(Y_{2,i} - \bar{Y}_{2})^2$

$S^2_p = \frac{(n_1 - 1)S^2_1 + (n_2 - 1)S^2_2}{n_1 + n_2 - 2}$

$S^2_p$ : is the common variance estimation

We calculate the t value :

$$ t = \frac{\bar{Y}_{1} - \bar{Y}_{2}}{S_p\sqrt{\frac{1}{n_1} + \frac{1}{n_2}}} $$

Finaly, after setting a level $\alpha$, we can calcula the p_value. If the $p-value > \alpha$ we don't reject $H_0$, otherwise, we rejected.

$$ p-value = \mathbb{P}(T > |t|) $$

where

$$ T \sim \mathcal{Student}(n_1 + n_2 -2) $$

In [ ]:
student_test(df, numeric_col, group_by='exited', alpha_level=0.05)
[age, exited] : Reject the null hypothesis with level 0.05; there is a significant difference between groups.
===
[estimated_salary, exited] : Fail to reject the null hypothesis with level 0.05; there is no significant difference between groups.
===
[balance, exited] : Reject the null hypothesis with level 0.05; there is a significant difference between groups.
===
[credit_score, exited] : Reject the null hypothesis with level 0.05; there is a significant difference between groups.
===
  • There is no significant difference in the mean of estimated_salary between individuals who stay and those who exit.

  • For the other variables, according to the t-test results, there is a significant difference.

Mean of continuous variables given the gender¶

In [ ]:
student_test(df, numeric_col, group_by='gender', alpha_level=0.05)
[age, gender] : Reject the null hypothesis with level 0.05; there is a significant difference between groups.
===
[estimated_salary, gender] : Fail to reject the null hypothesis with level 0.05; there is no significant difference between groups.
===
[balance, gender] : Fail to reject the null hypothesis with level 0.05; there is no significant difference between groups.
===
[credit_score, gender] : Fail to reject the null hypothesis with level 0.05; there is no significant difference between groups.
===
  • There is significant difference in the mean of age between male and female.

  • For the other variables, according to the t-test results, there is no significant difference.

Mean of continuous variables given the is active member¶

In [ ]:
student_test(df, numeric_col, group_by='is_active_member', alpha_level=0.05)
[age, is_active_member] : Reject the null hypothesis with level 0.05; there is a significant difference between groups.
===
[estimated_salary, is_active_member] : Fail to reject the null hypothesis with level 0.05; there is no significant difference between groups.
===
[balance, is_active_member] : Fail to reject the null hypothesis with level 0.05; there is no significant difference between groups.
===
[credit_score, is_active_member] : Reject the null hypothesis with level 0.05; there is a significant difference between groups.
===
  • The average age and the credit score of active customers is significantly different.

  • For the other variables, according to the t-test results, there is no significant difference.

3-Implementation of predictive models.¶

Befor deploying models, I'll create binary columns for each category and indicates the presence of the category with a 1 in the corresponding column. This is suitable when there is no ordinal relationship among the categories.

Also standardizing data is an important preprocessing step before applying machine learning algorithms for several reasons because often rely on numerical optimization techniques, such as gradient descent. These algorithms can be sensitive to the scale of input features. Standardizing the data ensures that all features have a mean of 0 and a standard deviation of 1, making the scale consistent across variables.

Replacing categorical by dummies¶

In [ ]:
#--------------------------------------------
# replace by dummies
#--------------------------------------------
df= pd.get_dummies(df, columns=['geography'], prefix="is")
df["gender"] = df["gender"].replace({"Female":1, "Male":0})
exited = df.pop("exited")
df["exited"] = exited
df = df*1
df.head(5)
In [ ]:
#--------------------------------------------
# create df with only numerical columns
#--------------------------------------------
numeric_df = df[numeric_col]
df = df.drop(numeric_df.columns, axis=1)

std_scaler = StandardScaler()
numeric_df = std_scaler.fit_transform(numeric_df)

Prepare and split the data¶

In [ ]:
# std_scaler = StandardScaler()
# numeric_df = std_scaler.fit_transform(numeric_df)
df = pd.concat([df, pd.DataFrame(numeric_df, columns=numeric_col)], axis=1)
df.head(5)
Out[ ]:
gender tenure num_of_products has_cr_card is_active_member is_France is_Germany is_Spain exited age estimated_salary balance credit_score
0 1 2 1 1 1 1 0 0 1 0.293517 0.021886 -1.225848 -0.326221
1 1 1 1 0 1 0 0 1 0 0.198164 0.216534 0.117350 -0.440036
2 1 8 3 1 0 1 0 0 1 0.293517 0.240687 1.333053 -1.536794
3 1 1 2 0 0 1 0 0 0 0.007457 -0.108918 -1.225848 0.501521
4 1 2 1 1 1 0 0 1 0 0.388871 -0.365276 0.785728 2.063884
In [ ]:
#--------------------------------------------
# OverSampling
#--------------------------------------------
X = df.drop("exited", axis=1).values
y = df["exited"].values

sm = SMOTE(random_state=514)
X, y = sm.fit_resample(X, y)
In [ ]:
#--------------------------------------------
# Split the data to train and test
#--------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=514)

In our imbalanced dataset, the occurrences of customers who do not churn significantly outnumber those who do churn. To address this imbalance, we employ oversampling techniques on train set, specifically using SMOTE (Synthetic Minority Over-sampling Technique). This method involves generating synthetic instances of the minority class, helping us balance the number of churn and non-churn instances in our dataset.

In [ ]:
unique, count = np.unique(y_train, return_counts=True)
{k:j for (k,j) in zip(unique, count)}
Out[ ]:
{0: 6407, 1: 6333}

Model deployement¶

In [ ]:
#--------------------------------------------
# Set the models
#--------------------------------------------
models = {
    "RF" : {'model':RandomForestClassifier(),
            'params': {
                'n_estimators' : [50, 75, 100], 
                'max_depth' : [1, 5, 10],
            }
    },


    "GB" : {'model':GradientBoostingClassifier(),
            'params' : {
                'n_estimators' : [50, 75, 100],
                'learning_rate': [0.01, 0.1, 1],
                'max_depth' : [1, 5, 10]
            }
    },

    "XGB": {'model': XGBClassifier(objective='binary:logistic'),
            'params': {
                'n_estimators' : [50, 75, 100],
                'max_depth' : [1, 5, 10],
                'learning_rate': [0.01, 0.1, 1],
            }          
    }
}
In [ ]:
#--------------------------------------------
# Find the best tunning
#--------------------------------------------
score = []

for mod, mp in models.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=10, return_train_score=False, scoring='f1')
    clf.fit(X_train, y_train)
    score.append({
        'model':mod,
        'best_score':clf.best_score_,
        'best_params': clf.best_params_
    })
In [ ]:
pd.DataFrame(score)
Out[ ]:
model best_score best_params
0 RF 0.875978 {'max_depth': 10, 'n_estimators': 75}
1 GB 0.903027 {'learning_rate': 0.1, 'max_depth': 10, 'n_est...
2 XGB 0.897041 {'learning_rate': 0.1, 'max_depth': 10, 'n_est...
In [ ]:
#-----------------------------------------------
# train the best models and predict the test set
#-----------------------------------------------
rf_pipe = Pipeline(steps =[("RF", RandomForestClassifier(n_estimators=100, max_depth=10))])
gb_pipe = Pipeline(steps =[("GB", GradientBoostingClassifier(n_estimators=100, max_depth=10, learning_rate=0.1))])
xgb_pipe = Pipeline(steps =[("XGB", XGBClassifier(n_estimators=100, max_depth=10, learning_rate=0.1))])

rf_pipe.fit(X_train, y_train)
gb_pipe.fit(X_train, y_train)
xgb_pipe.fit(X_train, y_train)

rf_prediction = rf_pipe.predict(X_test)
gb_prediction = gb_pipe.predict(X_test)
xgb_prediction = xgb_pipe.predict(X_test)
In [ ]:
f1(xgb_prediction, y_test)
Out[ ]:
0.9043862417166298
In [ ]:
import plotly.express as ex
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.offline as pyo
pyo.init_notebook_mode()
sns.set_style('darkgrid')
In [ ]:
fig = go.Figure(data=[go.Table(header=dict(values=['<b>Model<b>', '<b>F1 Score On Test Data<b>'],
                                           line_color='darkslategray',
    fill_color='whitesmoke',
    align=['center','center'],
    font=dict(color='black', size=18),
    height=40),
                               
                 cells=dict(values=[['<b>Random Forest<b>', '<b>AdaBoost<b>','<b>SVM<b>'], [np.round(f1(rf_prediction,y_test),2), 
                                                                          np.round(f1(gb_prediction,y_test),2),
                                                                          np.round(f1(xgb_prediction,y_test),2)]]))
                     ])

fig.update_layout(title='Model Results On Test Data')
fig.show()